library(limma)
library(Glimma)
library(edgeR)
library(dplyr)
library(magrittr)
setwd('~/Desktop/USC/Fall_2020/TRGN_510/510_FinalProject/adenocarcinoma_vs_squamous_cell_carcinoma')
# Reading in count data
count_matrix = read.table(file = 'count_processed.txt',sep = '\t',header = T,stringsAsFactors = F, check.names = F, row.names = 1)
count_matrix = count_matrix[,order(names(count_matrix))]
#Reading in clinical metadata
clinical_metadata = read.table(file = 'clinical_processed.txt', sep = '\t', header = T, as.is = T, check.names = F)
clinical_metadata = clinical_metadata[!duplicated(clinical_metadata[ , c("case_submitter_id")]),]
# Reading in exposure metadata
exposure_metadata = read.csv(file = 'exposure_processed.txt', sep = '\t', header = T, as.is = T, check.names = F)
# Merging metadata
all_metadata = merge(x = clinical_metadata, y = exposure_metadata, by = 'case_submitter_id')
all_metadata$ethnicity <- as.factor(all_metadata$ethnicity)
all_metadata$gender <- as.factor(all_metadata$gender)
all_metadata%<>%
mutate(diagnosis=case_when(
primary_diagnosis %in% c("Adenocarcinoma with mixed subtypes","Adenocarcinoma, NOS","Bronchiolo-alveolar adenocarcinoma, NOS","Papillary adenocarcinoma, NOS") ~ c("Adenocarcinoma"),
primary_diagnosis %in% c("Squamous cell carcinoma, large cell, nonkeratinizing, NOS","Squamous cell carcinoma, NOS") ~ c("Squamous_cell_carcinoma")
))
all_metadata$diagnosis = as.factor(all_metadata$diagnosis)
all_metadata$race <- as.factor(all_metadata$race)
rownames(all_metadata) <- all_metadata$case_submitter_id
all_metadata = all_metadata[,-1]
all_metadata = all_metadata[order(rownames(all_metadata)),]
# Creating DGEList Object
geneExpr = DGEList(counts = count_matrix, samples = all_metadata)
geneExpr$samples$group=all_metadata$diagnosis
geneid = rownames(geneExpr)
# Importing genecode reference to map annotation from DGC website
gencode = read.table('gencode.gene.info.v22.tsv',sep = '\t',header = T,stringsAsFactors = F, check.names = F)
genes = gencode[geneid %in% gencode$gene_id, ]
genes = genes[,c("gene_id","gene_name","seqname")]
rownames(genes) = genes$gene_id
genes = genes[order(genes$gene_id),]
geneExpr$genes = genes[,-1]
cpm <- cpm(geneExpr)
lcpm <- cpm(geneExpr, log=TRUE)
L <- mean(geneExpr$samples$lib.size) * 1e-6
M <- median(geneExpr$samples$lib.size) * 1e-6
c(L,M)
[1] 52.92581 48.81745
The average library size for this dataset is about 51.9 million.
table(rowSums(geneExpr$counts==0)==40)
FALSE TRUE
53740 6743
keep.exprs <- filterByExpr(geneExpr, group=geneExpr$samples$group)
Around 11% of genes in the dataset have zero counts across all 40 samples.
geneExpr <- geneExpr[keep.exprs,, keep.lib.sizes=FALSE]
dim(geneExpr)
[1] 21170 40
In this dataset, the median library size is 48.8 million and 10/48.8 is about 0.2, therefore the filterByExpr function keeps genes that have a CPM of 0.2 or more. With this cutoff the number of genes are reduced to 21170, about 35% of genes from what I started with.
lcpm.cutoff <- log2(10/M + 2/L)
library(RColorBrewer)
nsamples <- ncol(geneExpr)
col <- brewer.pal(nsamples, "Paired")
par(mfrow=c(1,2))
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.7), las=2, main="", xlab="")
title(main="A. Raw data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
legend("topright", colnames(geneExpr$counts), text.col=col, bty="n")
lcpm_filtered <- cpm(geneExpr, log=TRUE)
plot(density(lcpm_filtered[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. Filtered data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm_filtered[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
legend("topright", colnames(geneExpr$counts), text.col=col, bty="n")
This graph showed that before filtering the data, a large portion of genes within each samples are lowly-expressed with log-CPM values that are small or negative.
geneExpr = calcNormFactors(geneExpr, method = "TMM")
geneExpr$samples$norm.factors
[1] 1.0391573 0.9244982 0.8856329 1.0521559 1.0145085 1.1196887 1.0727778 1.3600421 1.0171641 0.9386345 0.8890663 0.6747847 0.9553945 1.1161617
[15] 0.9624960 1.1869816 1.0062698 0.8520753 0.8883963 1.0043621 0.8191521 1.0129474 0.9248736 1.1260210 1.0694870 0.7173413 1.0284701 1.0027376
[29] 1.0325592 1.1019957 1.0554783 0.9856847 1.0846584 1.0126931 1.1959792 0.9179716 1.1264297 1.0857329 1.0489496 1.0082920
geneExpr_unnormalized <- geneExpr
geneExpr_unnormalized$samples$norm.factors <- 1
par(mfrow=c(1,2))
lcpm_unnormalized <- cpm(geneExpr_unnormalized, log=TRUE)
boxplot(lcpm_unnormalized, las=2, col=col, main="")
title(main="A. Example: Unnormalised data",ylab="Log-cpm")
geneExpr_normalized <- calcNormFactors(geneExpr_unnormalized)
geneExpr_normalized$samples$norm.factors
[1] 1.0391573 0.9244982 0.8856329 1.0521559 1.0145085 1.1196887 1.0727778 1.3600421 1.0171641 0.9386345 0.8890663 0.6747847 0.9553945 1.1161617
[15] 0.9624960 1.1869816 1.0062698 0.8520753 0.8883963 1.0043621 0.8191521 1.0129474 0.9248736 1.1260210 1.0694870 0.7173413 1.0284701 1.0027376
[29] 1.0325592 1.1019957 1.0554783 0.9856847 1.0846584 1.0126931 1.1959792 0.9179716 1.1264297 1.0857329 1.0489496 1.0082920
lcpm_normalized <- cpm(geneExpr_normalized, log=TRUE)
boxplot(lcpm_normalized, las=2, col=col, main="")
title(main="B. Example: Normalised data",ylab="Log-cpm")
This figure showed that after normalization the samples are more similar to each other.
lcpm <- cpm(geneExpr, log=TRUE)
par(mfrow=c(1,2))
col.group <- geneExpr$samples$group
levels(col.group) <- brewer.pal(nlevels(col.group), "Set1")
col.group <- as.character(col.group)
col.race <- geneExpr$samples$race
levels(col.race) <- brewer.pal(nlevels(col.race), "Set2")
col.race <- as.character(col.race)
col.gender <- geneExpr$samples$gender
levels(col.gender) <- brewer.pal(nlevels(col.gender), "Set3")
col.gender <- as.character(col.gender)
col.age <- geneExpr$samples$age_at_diagnosis
levels(col.age) <- brewer.pal(nlevels(col.age), "Set1")
col.age <- as.character(col.age)
col.ethnicity <- geneExpr$samples$ethnicity
levels(col.ethnicity) <- brewer.pal(nlevels(col.ethnicity), "Set2")
col.ethnicity <- as.character(col.ethnicity)
col.cigar_per_day <- geneExpr$samples$cigarettes_per_day
levels(col.cigar_per_day) <- brewer.pal(nlevels(col.cigar_per_day), "Set3")
col.cigar_per_day <- as.character(col.cigar_per_day)
plotMDS(lcpm,labels=geneExpr$samples$group, col=col.group)
title(main="A. Sample groups")
plotMDS(lcpm, labels=geneExpr$samples$gender, col=col.gender, dim=c(3,4))
title(main="B. Sex")
plotMDS(lcpm, labels=geneExpr$samples$age_at_diagnosis, col=col.age, dim=c(3,4))
title(main="C. Age at diagnosis")
plotMDS(lcpm, labels=geneExpr$samples$race, col=col.race, dim=c(3,4))
title(main="D. Race")
plotMDS(lcpm, labels=geneExpr$samples$ethnicity, col=col.ethnicity, dim=c(3,4))
title(main="E. Ethnicity")
plotMDS(lcpm, labels=geneExpr$samples$cigarettes_per_day, col=col.cigar_per_day, dim=c(3,4))
title(main="F. Cigarettes per day")
In the MDS plot, there is some separation between group of patients diagnosed wiht Adenocarcinoma and Squamous cell carcinoma. You can also see male and female patients are segregating together. However, you cannot see any distinct clusters based on Age at diagnosis, Race, Ethncity and Cigarette per day. Therefore when we contruct the design matrix, we will take sex in to account and not the other variables.
diagnosis = geneExpr$samples$group
ethnicity = geneExpr$samples$ethnicity
sex = geneExpr$samples$gender
race = geneExpr$samples$race
age_at_diagnosis = geneExpr$samples$age_at_diagnosis
smoking_hist = geneExpr$samples$cigarettes_per_day
design = model.matrix(~0+diagnosis+sex)
colnames(design) <- gsub("diagnosis", "", colnames(design))
contr.matrix <- makeContrasts(
AdenocarcinomavsSquamous_cell_carcinoma = Adenocarcinoma-Squamous_cell_carcinoma,
levels = colnames(design))
contr.matrix
Contrasts
Levels AdenocarcinomavsSquamous_cell_carcinoma
Adenocarcinoma 1
Squamous_cell_carcinoma -1
sexmale 0
par(mfrow=c(1,2))
v <- voom(geneExpr, design, plot=TRUE)
v
An object of class "EList"
$genes
21165 more rows ...
$targets
35 more rows ...
$E
TCGA-22-4613 TCGA-22-5489 TCGA-22-5491 TCGA-33-AAS8 TCGA-34-5231 TCGA-43-7656 TCGA-43-8118 TCGA-44-5645 TCGA-44-A47G TCGA-46-3765
ENSG00000000003.13 6.578393 4.518445 7.050818 5.481583 6.016010 5.138312 5.436772 5.635659 5.702467 6.113541
ENSG00000000419.11 5.006541 5.125550 6.009261 5.535389 5.227853 6.273798 5.677880 4.093028 4.281239 6.092018
ENSG00000000457.12 3.899691 4.351443 4.164341 3.347290 4.310104 4.022563 3.425331 5.427188 3.977077 3.866716
ENSG00000000460.15 3.720341 4.789466 4.070052 3.040155 4.109311 3.920045 3.333184 4.295921 2.528768 3.742487
ENSG00000000938.11 4.311517 4.032854 2.267258 2.617683 2.708592 3.026093 5.302423 2.653449 5.680948 3.611814
TCGA-46-3766 TCGA-49-4486 TCGA-49-4487 TCGA-49-AARO TCGA-50-5942 TCGA-50-5946 TCGA-50-8457 TCGA-50-8460 TCGA-55-7726 TCGA-55-8089
ENSG00000000003.13 6.379925 6.279390 5.972862 6.359283 5.745927 5.480757 5.061664 5.801764 5.618804 5.648883
ENSG00000000419.11 5.071231 5.475511 5.916025 4.660764 4.466833 5.072669 4.197116 4.935787 6.016039 5.237966
ENSG00000000457.12 3.928589 4.975262 4.130680 3.932304 4.777173 4.218002 4.293238 3.792598 3.814772 4.220320
ENSG00000000460.15 2.508825 2.713849 4.464368 2.723976 1.971207 3.939701 2.450756 1.987285 3.384872 3.679206
ENSG00000000938.11 4.778872 2.166361 5.063277 5.362237 2.672767 1.804396 4.286484 4.685892 3.749970 5.450806
TCGA-56-5897 TCGA-56-A4ZJ TCGA-56-A5DR TCGA-63-A5MH TCGA-63-A5MY TCGA-64-1676 TCGA-68-8250 TCGA-73-4662 TCGA-77-A5G7 TCGA-85-8048
ENSG00000000003.13 4.869300 5.409876 5.176682 5.836250 5.890766 7.721230 6.362591 6.141768 5.962677 6.170662
ENSG00000000419.11 5.088500 5.088331 5.362201 5.024777 6.699551 6.669445 6.051110 4.699274 6.046526 5.186354
ENSG00000000457.12 3.935572 3.573813 4.258951 4.356796 4.139408 4.442916 3.510752 5.172006 4.208445 4.329645
ENSG00000000460.15 3.653911 2.581473 4.020476 4.482014 4.332709 3.469883 3.347400 4.412238 4.220873 4.220775
ENSG00000000938.11 4.716595 5.280737 2.910954 2.161463 2.534827 4.735432 4.758486 4.545843 2.619316 5.029770
TCGA-86-7953 TCGA-86-8076 TCGA-90-7766 TCGA-91-6828 TCGA-93-A4JO TCGA-96-7545 TCGA-98-A53C TCGA-99-AA5R TCGA-NJ-A4YQ TCGA-O1-A52J
ENSG00000000003.13 5.978361 5.688321 6.386395 5.854894 6.055768 6.086418 4.356866 4.898964 5.288840 5.802251
ENSG00000000419.11 4.206129 4.600721 5.411248 4.413630 5.220789 5.534683 4.524946 4.397344 4.731263 5.020307
ENSG00000000457.12 3.843559 4.470898 4.452007 4.641835 4.296636 3.867595 3.721552 4.155948 4.791851 4.284247
ENSG00000000460.15 4.150741 2.488662 3.854044 2.833918 3.301439 2.842824 2.332691 2.017264 3.217170 3.054791
ENSG00000000938.11 4.653009 4.916381 3.879929 4.778828 5.055971 4.323060 5.647853 6.152786 4.741985 5.868203
21165 more rows ...
$weights
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
[1,] 1.9156382 2.265171 2.264350 2.213717 2.251193 2.259541 2.213335 2.134201 2.196382 2.178912 2.103449 2.260762 1.9845015 2.199452 2.147720 2.261012
[2,] 1.7274124 2.263898 2.259424 2.130467 2.259430 2.249206 2.129639 1.709450 1.843064 2.068303 1.956363 2.159220 1.4994892 1.850504 1.736032 2.236428
[3,] 1.1247909 1.889080 1.831902 1.519393 2.058846 1.750188 1.518138 1.492255 1.626831 1.428855 1.303977 1.918998 1.3029032 1.634603 1.518237 2.096214
[4,] 0.9981961 1.764861 1.705691 1.323075 1.957294 1.621630 1.321983 1.007585 1.085743 1.245279 1.141339 1.377043 0.9075725 1.090570 1.022096 1.590228
[5,] 1.2599568 1.638856 1.578728 1.704841 1.839755 1.497199 1.703511 1.700961 1.835126 1.608547 1.469780 1.696029 1.4916402 1.842526 1.728006 1.913948
[,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32]
[1,] 2.239719 2.192143 1.912564 2.265181 2.254155 2.229398 2.217851 2.221969 2.247060 2.0726890 2.238861 2.258105 2.162523 2.207645 2.263954 2.264046
[2,] 1.982529 1.936786 1.422262 2.210388 2.240132 2.157465 2.188616 2.194131 2.229313 1.7227708 2.217553 2.217851 2.116843 2.174970 2.125441 2.182022
[3,] 1.780240 1.608988 1.237064 2.026705 1.700806 1.572091 1.526228 1.539838 1.649835 1.3955026 1.609251 2.124609 1.389669 1.493012 1.965746 1.964307
[4,] 1.189021 1.130158 0.874648 1.494592 1.571770 1.369460 1.405102 1.417681 1.522205 0.9983615 1.483421 1.549154 1.279861 1.374249 1.347076 1.422712
[5,] 1.975738 1.388865 1.414733 1.821489 1.450037 1.757968 1.295786 1.307260 1.403482 1.2056881 1.367369 2.215406 1.183066 1.267782 2.120177 1.746381
[,33] [,34] [,35] [,36] [,37] [,38] [,39] [,40]
[1,] 2.260432 2.0272364 2.252693 2.259175 2.235016 2.216680 2.171008 2.221055
[2,] 2.229632 1.6608334 2.118335 2.248536 2.170787 1.901291 1.781816 1.914839
[3,] 1.768653 1.3415540 1.847571 1.745479 1.598156 1.688464 1.563408 1.703532
[4,] 1.554026 0.9676036 1.311413 1.616765 1.392875 1.125273 1.047906 1.134973
[5,] 1.945727 1.1615881 1.619536 1.492648 1.784314 1.893762 1.773524 1.907243
21165 more rows ...
$design
Adenocarcinoma Squamous_cell_carcinoma sexmale
1 0 1 0
2 0 1 1
3 0 1 1
4 0 1 0
5 0 1 1
35 more rows ...
vfit <- lmFit(v, design)
vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit)
plotSA(efit, main="Final model: Mean-variance trend")
summary(decideTests(efit))
AdenocarcinomavsSquamous_cell_carcinoma
Down 2482
NotSig 16700
Up 1988
res = topTable(efit,sort.by = "P",n=Inf)
head(res)
tfit <- treat(vfit, lfc=1)
dt <- decideTests(tfit)
summary(dt)
AdenocarcinomavsSquamous_cell_carcinoma
Down 320
NotSig 20769
Up 81
After limiting the LFC threshold to 1, there are less DEGs and we can focus on genes that are most differentially expressed.
plotMD(tfit, column=1, status=dt[,1], main=colnames(tfit)[1],
xlim=c(-8,13))
glMDPlot(tfit, coef=1, status=dt, main=colnames(tfit)[1],
side.main="gene_name", counts=lcpm, groups=geneExpr$samples$group, launch=FALSE)
library(gplots)
par(mar = rep(2, 4))
Adenocarcinoma.vs.SquamousCell.topgenes <- res$gene_name[1:100]
i <- which(v$genes$gene_name %in% Adenocarcinoma.vs.SquamousCell.topgenes)
mycol <- colorpanel(1000,"blue","white","red")
heatmap.2(lcpm[i,], scale="row",
labRow=v$genes$gene_name[i], labCol=v$targets$group,
col=mycol, trace="none", density.info="none",
margin=c(8,12), lhei=c(2,10), dendrogram="column")
library(Homo.sapiens)
geneid = v$genes$gene_name
genes = select(Homo.sapiens, keys = geneid, columns = "ENTREZID", keytype = "SYMBOL")
genes <- genes[!duplicated(genes$SYMBOL),]
genes_v = v$genes
genes_v$ensembl = rownames(genes_v)
genes_v_new = merge(genes_v,genes,by.x="gene_name",by.y="SYMBOL",sort=F)
rownames(genes_v_new) = genes_v_new$ensembl
genes_v_new = genes_v_new[,-3]
v$genes = genes_v_new
load("human_c2_v5p2.rdata")
idx <- ids2indices(Hs.c2,id=v$genes$ENTREZID)
cam.AdenovsSqua <- camera(v,idx,design,contrast = contr.matrix[,1])
barcodeplot(efit$t[,1], index=idx$AMIT_EGF_RESPONSE_20_MCF10A,
index2=idx$DORN_ADENOVIRUS_INFECTION_24HR_DN, main="Adenocarcinoma Vs Squamous Cell Carcinoma")